# -*- coding: utf-8 -*-
from itertools import islice

import jieba
import pandas as pd

from settings import path

# Load the raw training file: tab-separated, with the two columns named
# explicitly as 'text' and 'label'. 'utf' is a Python codec alias for UTF-8.
data = pd.read_csv(
    path.path_train_txt,
    sep='\t',
    names=['text', 'label'],
    encoding='utf',
)


# Word-segmentation preprocessing
def cut_sentence(s, max_words=30):
    """Segment a sentence with jieba and keep only its leading tokens.

    Args:
        s: The raw sentence to segment. A missing value (``None`` or NaN,
            e.g. an empty cell produced by pandas) is treated as an empty
            sentence.
        max_words: Maximum number of tokens to keep. Defaults to 30; pass
            ``None`` to keep every token.

    Returns:
        The kept tokens joined by single spaces, or ``''`` when ``s`` is a
        missing value.
    """
    # A missing cell is not text and would make jieba raise, aborting the
    # whole run; map it to an empty result instead.
    if pd.isnull(s):
        return ''
    # jieba.cut yields tokens lazily, so islice stops after ``max_words``
    # tokens instead of segmenting the entire sentence and discarding the rest.
    return ' '.join(islice(jieba.cut(s), max_words))


# Tokenize every row of the 'text' column into a new 'word' column.
# Series.apply is used because the original author expected it to be
# faster than an equivalent list(map(...)) pass over the column.
data['word'] = data['text'].apply(cut_sentence)

# Write the frame, now including the 'word' column, without the row index.
data.to_csv(
    path.path_train_processed_txt,
    index=False,
)



